setwd("~/Desktop/orderbook/Research/R")
library(data.table)
library(tidyr)
library(xts)




# event size and price rounding must be ticker-dependent
event_size_by_ticker <- data.table(
  ticker = c("AAPL", "APRN", "LYFT", "TSG", "UBER"),
  size = c(100, 200, 100, 200, 100),
  price = c(1e4, 0.1e4, 0.5e4, 0.2e4, 1e4)
)


# load data ----------------------------------------------------------------------------------------------------------------
data_dir <- "~/Desktop/orderbook/Research/lobster/"
save_dir <- "~/Desktop/orderbook/Research/image_support_test/"
if (grepl("/n/", getwd())){
  save_dir <- "/n/scratchlfs/kou_lab/shihaoyang/lob/"
  data_dir <- "/n/kou_lab/shihaoyang/lob/"
}

args <- commandArgs(trailingOnly=TRUE)
if(length(args) == 0){
  # AAPL_2019-01-09 only has 10 minutes of data, linear model gives NA on many places, code breaks
  data_file_name <- "AAPL_2019-01-02"
#  data_file_name <- "LYFT_2019-05-10"
}else{
  all_files <- list.files(data_dir)
  all_files <- all_files[grep("_34200000_57600000_", all_files)]
  all_files <- strsplit(all_files, "_34200000_57600000_")
  all_files <- sort(unique(sapply(all_files, function(x) x[1])))
  data_file_name <- all_files[as.numeric(args[1])]
}
print(data_file_name)
data_ticker <- strsplit(data_file_name, "_")[[1]][[1]]

book_msg <- fread(paste0(data_dir, data_file_name, "_34200000_57600000_message_10.csv"))
book_order <- fread(paste0(data_dir, data_file_name, "_34200000_57600000_orderbook_10.csv"))

level_no <- ncol(book_order) / 4

colnames(book_msg) <- c("time", "event_type", "order_id", "size", "price", "direction", "note")
colnames(book_order) <- paste0(c("ask_price_", "ask_size_", "bid_price_", "bid_size_"), ceiling(1:ncol(book_order) / 4))

book <- copy(book_order)
sapply(book, class)

# fix integer 64 9999999999 values
for(col in names(book)) set(book, i=which(book[[col]]==9999999999), j=col, value=10000000000)
for(col in names(book)) set(book, i=which(book[[col]]==-9999999999), j=col, value=-10000000000)

#
book[, mid_price := (ask_price_1 + bid_price_1) / 2]

book[, ref_price := mid_price]
if (book$ref_price[1] %% 100 == 0){
  book$ref_price[1] <- book$ref_price[1] + 50
}
book[, ref_price_change := ref_price - c(NA, ref_price[-nrow(book)])]

while(nrow(book[ref_price %% 100 == 0 & ref_price_change > 0]) > 0){
  book[ref_price %% 100 == 0 & ref_price_change > 0, ref_price := ref_price - 50]
  book[, ref_price_change := ref_price - c(NA, ref_price[-nrow(book)])]
  print(nrow(book[ref_price %% 100 == 0 & ref_price_change > 0]))
}

while(nrow(book[ref_price %% 100 == 0 & ref_price_change < 0]) > 0){
  book[ref_price %% 100 == 0 & ref_price_change < 0, ref_price := ref_price + 50]
  book[, ref_price_change := ref_price - c(NA, ref_price[-nrow(book)])]
  print(nrow(book[ref_price %% 100 == 0 & ref_price_change < 0]))
}

nrecord <- nrow(book)
stopifnot(nrecord == nrow(book_order))
stopifnot(nrecord == nrow(book_msg))
book[, stopifnot(all(abs(ref_price_change[-1] - diff(ref_price)) < 1e-4))]

book[, stopifnot(all(ref_price %% 100 == 50))]
book[, stopifnot(all(ref_price > bid_price_1))]
book[, stopifnot(all(ref_price < ask_price_1))]

# For each level, assign "q+/-" to ask/bid sizes.
for(k in 1:level_no){
  book[, paste0("q_plus_", k) := integer(nrow(book))]
  for(i in 1:k){
    book[get(paste0("ask_price_", i)) == ref_price + 50 + (k-1) * 100, 
         paste0("q_plus_", k) := get(paste0("ask_size_", i))]
  }
  for(i in setdiff(1:level_no, 1:k)){
    book[, stopifnot(all(get(paste0("ask_price_", i)) > ref_price + 50 + (k-1) * 100))]
  }
  
  book[, paste0("q_minus_", k) := integer(nrow(book))]
  for(i in 1:k){
    book[get(paste0("bid_price_", i)) == ref_price - 50 - (k-1) * 100, 
         paste0("q_minus_", k) := get(paste0("bid_size_", i))]
  }
  for(i in setdiff(1:level_no, 1:k)){
    book[, stopifnot(all(get(paste0("bid_price_", i)) < ref_price - 50 - (k-1) * 100))]
  }
}

summary(book$q_minus_1)
summary(book$q_plus_1)
barplot(colMeans(book[, mget(c(paste0("q_minus_", 10:1), paste0("q_plus_", 1:10)))]))

id <- book_msg[, which(event_type %in% c(5, 6, 7))]
# confirm event_type 5, 6, 7 does not change orderbook
id <- setdiff(id, 1)
# data.matrix will return NA. The program works by deleting data.matrix()
#stopifnot(all(data.matrix(book[id, mget(colnames(book_order))]) - 
#                data.matrix(book[id-1, mget(colnames(book_order))]) == 0))

stopifnot(all(book[id, mget(colnames(book_order))] - 
                book[id-1, mget(colnames(book_order))] == 0))
# confirmed that reference price construction is still correct if delete here 
# book in original order is orderbook shape right after the event
book <- book[-id]
book_event <- book_msg[-id]
# now book has orderbook shape right before the event. Original matching from Lobster data is after the event.
book <- cbind(book[-nrow(book)], book_event[-1])
nrecord <- nrow(book)
book[, stopifnot(all(abs(ref_price_change[-1] - diff(ref_price)) < 1e-4))]

stopifnot(book[,all(price %% 100 == 0)])
book[, modifying_queue := (book$price - book$ref_price) / 100]
book[, modifying_queue := sign(modifying_queue) * (abs(modifying_queue) + 0.5)]
book[, table(modifying_queue)]
book[!modifying_queue %in% -10:10, .N, by = modifying_queue]
# book <- book[modifying_queue %in% -10:10]
book[, stopifnot(all(abs(ref_price_change[-1] - diff(ref_price)) < 1e-4))]
stopifnot(nrecord == nrow(book))

book <- book[event_type %in% c(1,2,3,4,5)]
book[event_type == 1, event_type_coarsen := "+"]
book[event_type %in% c(2,3), event_type_coarsen := "-"]
book[event_type %in% c(4,5), event_type_coarsen := "t"]

book[, new_ref_price_after_event := c(ref_price[-1], NA)]
book[, ref_price_jump_after_event := new_ref_price_after_event - ref_price]

average_event_size <- book[,mean(size),by=modifying_queue]
average_event_size <- average_event_size[order(modifying_queue)]
predetermined_size_unit <- event_size_by_ticker[ticker == data_ticker, size]
if(length(predetermined_size_unit) != 1){
  predetermined_size_unit <- 100
}
average_event_size$V1 <- predetermined_size_unit

for(k in 1:level_no){
  book[, paste0("q_std_plus_", k) := get(paste0("q_plus_", k)) / average_event_size[modifying_queue == k, V1]]
  book[, paste0("q_std_plus_", k) := ceiling(get(paste0("q_std_plus_", k)))]
  book[, paste0("q_std_minus_", k) := get(paste0("q_minus_", k)) / average_event_size[modifying_queue == -k, V1]]
  book[, paste0("q_std_minus_", k) := ceiling(get(paste0("q_std_minus_", k)))]
}

book[, queue_and_event := paste0(sprintf("%+d", modifying_queue), "(", event_type_coarsen, ")")]

book[, bid_ask_spread := (ask_price_1 - bid_price_1)]
book[, bid_ask_spread := pmin(bid_ask_spread, 900) / 100]

book[, q_std_best_ask := ceiling(ask_size_1 / average_event_size[modifying_queue==1,V1])]
book[, q_std_best_bid := ceiling(bid_size_1 / average_event_size[modifying_queue==-1,V1])]

book[, ref_price_event := ""]
book[ref_price_jump_after_event > 0, ref_price_event := paste0("p+(", event_type_coarsen, ")")]
book[ref_price_jump_after_event < 0, ref_price_event := paste0("p-(", event_type_coarsen, ")")]
book[ref_price_jump_after_event != 0, table(ref_price_event)]
book[ref_price_jump_after_event != 0, table(ref_price_event, modifying_queue)]
# the common jump because of -, +, t, can assume to be modifying the least expensive queue
# the common jump due to + is impossible if the bid-ask spread is tight (1 tick)
# same as cancellation/trade is impossible if the queue has zero size
book[ref_price_jump_after_event == "p-(+)",stopifnot(all(direction == 1))]
book[ref_price_jump_after_event == "p+(+)",stopifnot(all(direction == -1))]

book[ref_price_jump_after_event != 0, queue_and_event := ref_price_event]
book[, sum(size), by=event_type_coarsen]


# Hawkes-Markov model for orderbook process ------------------------------------------------
event <- dcast(book[modifying_queue %in% -3:3], 
               time ~ queue_and_event, value.var = "event_type_coarsen",
               fun.aggregate = length)
cols_all <- paste0(rep(sprintf("%+d", c(1:3, -(1:3))), each=3), "(", c("+", "-", "t"), ")")
cols_all <- c(cols_all, c("p+(+)", "p+(-)", "p+(t)", "p-(+)", "p-(-)", "p-(t)"))
for(i in setdiff(cols_all, colnames(event))){
  event[, eval(i) := 0]
}
event <- event[,mget(c("time", cols_all))]

save("~/guestDesktop/Research/R/gof/AAPL_2019-01-02.RData")
